# Scrape the "SS"-rank table and tidy it:
# * split the ポケモン column into a name plus up to two annotations,
# * fold Alolan-form rows (where the split puts "アローラ" first) back
#   into the "name(アローラの姿)" form,
# * split the recommended-move column on 【/】 into type/move pairs.
(rank_ss <- site_html %>%
  rvest::html_node(xpath = xpath_ss) %>%
  rvest::html_table() %>%
  dplyr::mutate(rank = "SS") %>%
  tidyr::separate(`ポケモン`, into = c("name", "memo", "memo2")) %>%
  dplyr::mutate(memo2 = dplyr::if_else(name == "アローラ", name, memo2),
                name = dplyr::if_else(name == "アローラ", memo, name),
                name = dplyr::if_else(!is.na(memo2),
                                      # paste0() is the idiomatic form of
                                      # paste(..., sep = "")
                                      paste0(name, "(", memo2, "の姿)"),
                                      name),
                memo = dplyr::if_else(!is.na(memo2), memo2, memo)) %>%
  dplyr::select(rank, name, type = `タイプ`, memo, CP = `最大CP`,
                skill = `おすすめわざ`) %>%
  tidyr::separate(skill, c("skill", "type1", "skill1", "type2", "skill2"),
                  sep = "([【】])") %>%
  # The fragment before the first 【 (presumably empty) is dropped here.
  dplyr::select(-skill)) %>%
  DT::datatable()
# Scrape the "S"-rank table; identical tidying to the SS-rank chunk:
# split the ポケモン column, normalise Alolan-form rows into
# "name(アローラの姿)", and split the recommended-move column on 【/】.
(rank_s <- site_html %>%
  rvest::html_node(xpath = xpath_s) %>%
  rvest::html_table() %>%
  dplyr::mutate(rank = "S") %>%
  tidyr::separate(`ポケモン`, into = c("name", "memo", "memo2")) %>%
  dplyr::mutate(memo2 = dplyr::if_else(name == "アローラ", name, memo2),
                name = dplyr::if_else(name == "アローラ", memo, name),
                name = dplyr::if_else(!is.na(memo2),
                                      # paste0() is the idiomatic form of
                                      # paste(..., sep = "")
                                      paste0(name, "(", memo2, "の姿)"),
                                      name),
                memo = dplyr::if_else(!is.na(memo2), memo2, memo)) %>%
  dplyr::select(rank, name, type = `タイプ`, memo, CP = `最大CP`,
                skill = `おすすめわざ`) %>%
  tidyr::separate(skill, c("skill", "type1", "skill1", "type2", "skill2"),
                  sep = "([【】])") %>%
  # The fragment before the first 【 (presumably empty) is dropped here.
  dplyr::select(-skill)) %>%
  DT::datatable()
# Scrape the "A+"-rank table; identical tidying to the SS-rank chunk:
# split the ポケモン column, normalise Alolan-form rows into
# "name(アローラの姿)", and split the recommended-move column on 【/】.
(rank_ap <- site_html %>%
  rvest::html_node(xpath = xpath_ap) %>%
  rvest::html_table() %>%
  dplyr::mutate(rank = "A+") %>%
  tidyr::separate(`ポケモン`, into = c("name", "memo", "memo2")) %>%
  dplyr::mutate(memo2 = dplyr::if_else(name == "アローラ", name, memo2),
                name = dplyr::if_else(name == "アローラ", memo, name),
                name = dplyr::if_else(!is.na(memo2),
                                      # paste0() is the idiomatic form of
                                      # paste(..., sep = "")
                                      paste0(name, "(", memo2, "の姿)"),
                                      name),
                memo = dplyr::if_else(!is.na(memo2), memo2, memo)) %>%
  dplyr::select(rank, name, type = `タイプ`, memo, CP = `最大CP`,
                skill = `おすすめわざ`) %>%
  tidyr::separate(skill, c("skill", "type1", "skill1", "type2", "skill2"),
                  sep = "([【】])") %>%
  # The fragment before the first 【 (presumably empty) is dropped here.
  dplyr::select(-skill)) %>%
  DT::datatable()
# Scrape the "A"-rank table; identical tidying to the SS-rank chunk:
# split the ポケモン column, normalise Alolan-form rows into
# "name(アローラの姿)", and split the recommended-move column on 【/】.
(rank_a <- site_html %>%
  rvest::html_node(xpath = xpath_a) %>%
  rvest::html_table() %>%
  dplyr::mutate(rank = "A") %>%
  tidyr::separate(`ポケモン`, into = c("name", "memo", "memo2")) %>%
  dplyr::mutate(memo2 = dplyr::if_else(name == "アローラ", name, memo2),
                name = dplyr::if_else(name == "アローラ", memo, name),
                name = dplyr::if_else(!is.na(memo2),
                                      # paste0() is the idiomatic form of
                                      # paste(..., sep = "")
                                      paste0(name, "(", memo2, "の姿)"),
                                      name),
                memo = dplyr::if_else(!is.na(memo2), memo2, memo)) %>%
  dplyr::select(rank, name, type = `タイプ`, memo, CP = `最大CP`,
                skill = `おすすめわざ`) %>%
  tidyr::separate(skill, c("skill", "type1", "skill1", "type2", "skill2"),
                  sep = "([【】])") %>%
  # The fragment before the first 【 (presumably empty) is dropped here.
  dplyr::select(-skill)) %>%
  DT::datatable()
# Stack the four per-rank tables into one long table and show it.
(rank_all <- dplyr::bind_rows(rank_ss, rank_s, rank_ap, rank_a)) %>%
  DT::datatable()
# Per-type maximum CP (rounded), used to enrich the rank-count table.
cp_table <- rank_all %>%
  dplyr::group_by(type) %>%
  dplyr::summarise(CP = round(max(CP)))
# Count Pokemon per rank within each type, widen ranks into columns,
# and attach the per-type max CP. pivot_wider() replaces the superseded
# spread(); names_sort = TRUE reproduces spread()'s alphabetical
# ordering of the new rank columns.
rank_all %>%
  dplyr::count(rank, type) %>%
  tidyr::pivot_wider(names_from = rank, values_from = n, names_sort = TRUE) %>%
  dplyr::left_join(cp_table, .) %>%
  DT::datatable()
# Heat map of (rounded) mean CP by type x rank; the text printed on
# each tile is the group's maximum CP (as the subtitle says).
rank_all %>%
dplyr::group_by(type, rank) %>%
dplyr::summarise(MaxCP = max(CP), MeanCP = round(mean(CP))) %>%
ggplot2::ggplot(ggplot2::aes(x = rank, y = type)) +
ggplot2::geom_tile(ggplot2::aes(fill = MeanCP)) +
# White labels so they remain readable on the dark viridis tiles.
ggplot2::geom_text(ggplot2::aes(label = MaxCP), colour = "#FFFFFF") +
ggplot2::scale_fill_continuous(type = "viridis") +
ggplot2::labs(subtitle = "白文字の数値は最大CP")
# Scatter of mean vs max CP per (type, rank); label only the groups
# whose max CP exceeds the rounded mean, leaving the rest unlabelled
# (empty string) so geom_label_repel stays uncluttered.
rank_all %>%
dplyr::group_by(type, rank) %>%
dplyr::summarise(MaxCP = max(CP), MeanCP = round(mean(CP))) %>%
dplyr::mutate(text = dplyr::case_when(MaxCP > MeanCP ~ type, TRUE ~ "")) %>%
ggplot2::ggplot(ggplot2::aes(x = MeanCP, y = MaxCP)) +
ggplot2::geom_point(ggplot2::aes(colour = rank)) +
ggrepel::geom_label_repel(ggplot2::aes(label = text, colour = rank))
# Render four more tables from the same page as interactive datatables.
# NOTE(review): the meaning of each xpath_* target is not visible from
# this file — presumably defined where the XPaths are declared.
site_html %>%
rvest::html_node(xpath = xpath_ce) %>%
rvest::html_table() %>%
DT::datatable()
site_html %>%
rvest::html_node(xpath = xpath_pa) %>%
rvest::html_table() %>%
DT::datatable()
site_html %>%
rvest::html_node(xpath = xpath_in) %>%
rvest::html_table() %>%
DT::datatable()
site_html %>%
rvest::html_node(xpath = xpath_op) %>%
rvest::html_table() %>%
DT::datatable()
取組一覧のヘッダが“東”と“西”にまとめられているために「ヘッダなし」で読みこまないとその後のデータフレーム化処理でおかしくなる点に注意。
# Scrape the bout table for each of the 15 tournament days.
# The header row merges the "East"/"West" groups, so the table must be
# read without a header and the first (header) row dropped before each
# day's table is tagged with its day number.
# Building the URL inside the single map_df() call replaces the
# original two-step map2(paste) / map2_df() dance.
df <- purrr::map_df(1:15, function(day) {
  xml2::read_html(paste0(site_url, day)) %>%
    rvest::xml_node(xpath = xpath_mu) %>%
    rvest::html_table(header = FALSE) %>%
    dplyr::slice(-1) %>%
    dplyr::mutate(day = day)
})
# Tidy the raw bout table: pull the wrestler name and the win-loss
# record out of the combined columns for the east (X2) and west (X6)
# sides, then rename the remaining positional columns.
result <- df %>%
  tidyr::extract(X2, "e_name", regex = "([^[:digit:]]+)", remove = FALSE) %>%
  tidyr::extract(X6, "w_name", regex = "([^[:digit:]]+)", remove = FALSE) %>%
  # Quantify the digit groups with "+" so double-digit records such as
  # "10勝4敗" are captured in full; a bare [[:digit:]] matches only one
  # digit and silently truncates two-digit win counts.
  tidyr::extract(X2, c("e_win", "e_lose"),
                 regex = "([[:digit:]]+)勝([[:digit:]]+)", remove = TRUE) %>%
  tidyr::extract(X6, c("w_win", "w_lose"),
                 regex = "([[:digit:]]+)勝([[:digit:]]+)", remove = TRUE) %>%
  dplyr::select(day, e_class = X1, e_name, e_win, e_lose, e_mark = X3,
                kimarite = X4, w_mark = X5, w_class = X7, w_name, w_win, w_lose)
result %>% DT::datatable()
# Same 15-day scrape as above, but for the table located by xpath_ju.
# Read without a header (merged East/West header row), drop that row,
# and tag each day's table with its day number.
df <- purrr::map_df(1:15, function(day) {
  xml2::read_html(paste0(site_url, day)) %>%
    rvest::xml_node(xpath = xpath_ju) %>%
    rvest::html_table(header = FALSE) %>%
    dplyr::slice(-1) %>%
    dplyr::mutate(day = day)
})
# Tidy the raw bout table (same transformation as the earlier chunk):
# extract names and win-loss records from the combined east/west
# columns, then rename the remaining positional columns.
result <- df %>%
  tidyr::extract(X2, "e_name", regex = "([^[:digit:]]+)", remove = FALSE) %>%
  tidyr::extract(X6, "w_name", regex = "([^[:digit:]]+)", remove = FALSE) %>%
  # "+" on the digit groups captures double-digit records ("10勝4敗");
  # a single [[:digit:]] would truncate them to one digit.
  tidyr::extract(X2, c("e_win", "e_lose"),
                 regex = "([[:digit:]]+)勝([[:digit:]]+)", remove = TRUE) %>%
  tidyr::extract(X6, c("w_win", "w_lose"),
                 regex = "([[:digit:]]+)勝([[:digit:]]+)", remove = TRUE) %>%
  dplyr::select(day, e_class = X1, e_name, e_win, e_lose, e_mark = X3,
                kimarite = X4, w_mark = X5, w_class = X7, w_name, w_win, w_lose)
result %>% DT::datatable()
# Scrape all 344 pages of daily quotes into a single data frame; the
# page number is simply appended to the base URL. A single map_df()
# replaces the original map2(paste)/map_df pair (whose inner function
# declared an unused .y argument).
(df <- purrr::map_df(1:344, function(page) {
  xml2::read_html(paste0(site_url, page)) %>%
    rvest::xml_node(xpath = xpath_na) %>%
    rvest::html_table()
})) %>%
  DT::datatable()
25日、75日移動平均線付き
# Keep a raw copy of the scraped quote table before transforming it.
na_df <- df
# Rename the Japanese OHLC columns, coerce types, add 25-/75-day
# trailing moving averages, and draw an interactive candlestick chart.
df %>%
dplyr::rename(date = `日付`,
Open = `始値`, High = `高値`, Low = `安値`, Close = `終値`) %>%
dplyr::mutate(date = lubridate::as_date(date)) %>%
# Remaining character columns hold numbers (presumably with thousands
# separators); parse_number strips the non-numeric characters.
dplyr::mutate_if(is.character, readr::parse_number) %>%
dplyr::arrange(date) %>%
# roll_meanr = right-aligned rolling mean; the first n-1 rows are NA.
dplyr::mutate(MA25 = RcppRoll::roll_meanr(Close, n = 25L, fill = NA),
MA75 = RcppRoll::roll_meanr(Close, n = 75L, fill = NA)) %>%
# read.zoo indexes on the first column (date); xts is what dygraphs
# expects as input.
zoo::read.zoo() %>% xts::as.xts() %>%
dygraphs::dygraph() %>%
dygraphs::dyCandlestick() %>%
dygraphs::dyRangeSelector(dateWindow = c("2018-01-01", "2018-11-30"))
本来は10ページ目まであるが、謎のマルチバイト文字(恐らく前週比の欄にある「-」)が含まれていてエラーになるので9ページ目までとしている。
# Scrape pages 1-9 of the daily quotes (page 10 is skipped — see the
# note above this chunk), parse the date column, and drop the volume
# column. A single map_df() with the URL built inline replaces the
# original map2(paste)/map_df pair.
(yd_df <- purrr::map_df(1:9, function(page) {
  xml2::read_html(paste0(site_url, page)) %>%
    rvest::xml_node(xpath = xpath_yd) %>%
    rvest::html_table()
}) %>%
  dplyr::mutate(`日付` = lubridate::as_date(`日付`)) %>%
  dplyr::select(-`売買高(株)`)) %>%
  DT::datatable()
# Rename the OHLC columns to English, add 5-/15-day trailing moving
# averages, and draw an interactive candlestick chart over all dates.
yd_df %>%
dplyr::select(date = `日付`,
Open = `始値`, High = `高値`, Low = `安値`, Close = `終値`) %>%
dplyr::arrange(date) %>%
# roll_meanr = right-aligned rolling mean; the first n-1 rows are NA.
dplyr::mutate(MA5 = RcppRoll::roll_meanr(Close, n = 5L, fill = NA),
MA15 = RcppRoll::roll_meanr(Close, n = 15L, fill = NA)) %>%
# read.zoo indexes on the first column (date); dygraphs takes xts.
zoo::read.zoo() %>% xts::as.xts() %>%
dygraphs::dygraph() %>%
dygraphs::dyCandlestick() %>%
dygraphs::dyRangeSelector()
# Extract the heading (location/date) and the "as of" timestamp as
# plain text; written as nested calls rather than pipe chains.
date_loc <- rvest::html_text(rvest::html_node(site_html, css = css_po))
time_now <- stringr::str_remove(
  rvest::html_text(rvest::html_node(site_html, css = css_nw)),
  pattern = "※"
)
# Read the AMeDAS observation table, drop the residual sub-header row
# left after header = TRUE, and parse each column to its proper type.
# The heading text and timestamp extracted above become the caption.
(amedas_df <- site_html %>%
rvest::html_node(css = css_da) %>%
rvest::html_table(header = TRUE) %>%
dplyr::slice(-1) %>%
dplyr::mutate(`時刻` = readr::parse_integer(`時刻`),
`気温` = readr::parse_double(`気温`),
`降水量` = readr::parse_double(`降水量`),
`風速` = readr::parse_double(`風速`),
`日照時間` = readr::parse_double(`日照時間`),
`積雪深` = readr::parse_double(`積雪深`),
`湿度` = readr::parse_integer(`湿度`),
`気圧` = readr::parse_double(`気圧`))) %>%
DT::datatable(caption = paste(date_loc, time_now))
XPathを利用してスクレイピングを行った際に情報が上手く取得できない場合があります。このような場合、CSS selectorを使ってください。CSS selectorの取得にはFirefoxの開発者ツールが向いています。
取得したデータが文字化けを起こしている場合、まずは、サイトのエンコード指定(文字セット指定)を確認してみてください。UTF-8以外の場合は変換すると文字化けが解消する場合があります。
rvestパッケージにはサイトのエンコードを推測するrvest::guess_encodingがあります。例えば気象庁のページはUTF-8ですが、rvest::guess_encodingにスクレイピングしたいページの情報を丸ごと渡すと以下のような結果を返してくれます。
# Let rvest guess the page's character encoding from the raw document.
site_html %>% rvest::guess_encoding()
シフトJISを使い続ける上場企業をまとめてみた を参考にUTF-8のサイトをチェックしてみると
# Check the guessed encoding of a few corporate sites.
"http://www.nisshin-oillio.com/" %>%
xml2::read_html() %>% rvest::guess_encoding()
# This one fails during read_html() — the bytes cannot be converted
# with the encoding xml2 picks (see the note in the surrounding text).
"https://www.makita.co.jp/" %>%
xml2::read_html() %>% rvest::guess_encoding()
## Error in doc_parse_raw(x, encoding = encoding, base_url = base_url, as_html = as_html, : input conversion failed due to input error, bytes 0x87 0x6F 0x8D 0x82 [6003]
"https://www.rakuten.co.jp/" %>%
xml2::read_html() %>% rvest::guess_encoding()
サイトによっては文字コードがShift JISなどの場合rvestパッケージで取得した情報が文字化けする場合があります。このような場合、ヘッダーから文字セットの情報を取得してエンコードを変換する必要があります。
HTMLのヘッダー部(headタグの間)にある文字セットのメタ情報(metaタグで囲われていてcharsetが記載されている部分)のCSSセレクタを取得します。
# Grab the first meta element in head and list its attributes; the
# charset appears inside the "content" attribute value.
(meta <- site_url %>%
xml2::read_html() %>%
rvest::html_nodes(css = "head > meta:nth-child(1)") %>%
rvest::html_attrs())
## [[1]]
## http-equiv content
## "Content-Type" "text/html; charset=UTF-8"
次に“charset=”の文字列の終了位置と文字セットの文字コード指定の最終文字の位置を取得します。これらの操作にはstringrパッケージを用います。
# Locate the end of "charset=" and the closing ")" within `meta`.
# NOTE(review): str_locate coerces the list `meta` to a character
# string (its deparsed form, which ends in ")"), and the positions
# below are taken on that deparsed string — fragile, but it matches
# the str_sub() extraction that follows.
start <- stringr::str_locate(meta, pattern = "charset=")[2]
end <- stringr::str_locate(meta, pattern = '\\)')[1]
得られた二つの文字位置に囲まれた部分が文字コードになっていますので、位置指定で文字コードを文字列として取り出します。
# Slice the encoding name out from between "charset=" and the closing
# quote/parenthesis of the deparsed string (hence the -2 offset).
stringr::str_sub(meta, start = start + 1, end = end - 2)
## [1] "UTF-8"